Extract, Transform, and Load

A Python 3.7 backend was developed to:

In a production environment:

#!/usr/bin/python3
import urllib.request
import pandas as pd
import io
import sqlite3
from datetime import datetime
from tqdm import tqdm
# Source URLs for the UCI SECOM dataset: sensor features and pass/fail labels.
dataUrl = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data"
labelUrl = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data"
# Local JSON file mapping each sample index to its vendors.
vendorUrl = "./data/vendordata.json"
# Rename vendor-file columns to SQL-friendly upper-case identifiers for sqlite.
newCols = {
    "datetime": "MFG_DATE",
    "mat vendor": "MAT_VENDOR",
    "part vendor": "PART_VENDOR",
    "sil vendor": "SIL_VENDOR",
    "adhs vendor": "ADHS_VENDOR",
    "sop vendor": "SOP_VENDOR",
}
def dfUpload(df, con, table, timeStamp=True, clearTable=False, debug=False):
    """Insert every row of ``df`` into ``table`` over the sqlite3 connection.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to upload; column names must match the target table's columns.
    con : sqlite3.Connection
        Open connection; committed once after all rows are inserted.
    table : str
        Target table name. Trusted input: it is interpolated into the SQL
        text (identifiers cannot be bound as parameters).
    timeStamp : bool
        When True, add/overwrite an ``INSERTED_ON`` column with the current
        local time before uploading.
    clearTable : bool
        When True, delete all existing rows from the table first.
    debug : bool
        When True, print any row that fails to insert and keep going
        instead of aborting the upload.
    """
    df = df.copy()  # work on a copy so the caller's frame is never mutated
    if timeStamp:
        df['INSERTED_ON'] = datetime.now()
    # Render datetime columns as text BEFORE the generic NaN sweep below:
    # .dt.strftime only works while the column is still datetime64, and NaT
    # entries must become None so they insert as SQL NULLs.
    # (Also fixes the format string: %H:%M:%S, not the invalid %h:%m:%s.)
    for col in df.columns:
        if df[col].dtype.kind == 'M':  # 'M' == numpy datetime64
            rendered = df[col].dt.strftime('%Y-%m-%d %H:%M:%S')
            df[col] = rendered.where(df[col].notnull(), None)
    df = df.where(pd.notnull(df), None)  # remaining NaN -> None, for SQL NULLs
    sqlColumns = '(' + ','.join(df.columns) + ')'
    sqlValues = '(' + ','.join(':' + str(i + 1) for i in range(len(df.columns))) + ')'
    sqlInsert = "INSERT INTO %s %s VALUES %s" % (table, sqlColumns, sqlValues)
    try:
        progress = tqdm  # module-level import, when available
    except NameError:
        # tqdm not importable: degrade gracefully to plain iteration
        def progress(iterable, **kwargs):
            return iterable
    crsr = con.cursor()
    try:
        if clearTable:
            crsr.execute("DELETE FROM %s" % table)
        for row in progress(df.values.tolist(), desc="Uploading data", unit="row"):
            if debug:
                try:
                    crsr.execute(sqlInsert, row)
                except sqlite3.Error:
                    # surface the offending row, then keep going
                    print(row)
            else:
                crsr.execute(sqlInsert, row)
        con.commit()
    finally:
        crsr.close()  # always release the cursor, even if an insert fails
def main():
    """Run the ETL: fetch SECOM features/labels from the web, merge the
    local vendor data, and load the combined frame into the SAMPLE table
    of the warehouse.db sqlite database."""
    def pandas_from_url(url):
        # pd.read_html() finds no tables at these URLs, so parse the raw
        # space-separated payload instead
        return pd.read_csv(io.BytesIO(urllib.request.urlopen(url).read()),
                           encoding="utf8", sep=" ", header=None)

    print("Fetching data from web and formatting...")
    data = pandas_from_url(dataUrl)
    # prefix feature columns with "F" (F0, F1, ...)
    data.columns = ["F" + str(i) for i in range(len(data.columns))]
    data['PASS_FAIL'] = pandas_from_url(labelUrl)[0]
    vendors = pd.read_json(vendorUrl).sort_index()
    df = data.merge(vendors, left_index=True, right_index=True)
    df.rename(index=str, columns=newCols, inplace=True)
    df['ID'] = list(range(len(df)))  # surrogate primary key
    print("Connecting to Sqlite...")
    con = sqlite3.connect("warehouse.db")
    try:
        print("Clearing table and inserting records...")
        dfUpload(df, con, "SAMPLE", clearTable=True)
    finally:
        # always release the connection, even if the upload fails
        print("Disconnecting from Sqlite...")
        con.close()
    print("Done!")


if __name__ == '__main__':
    main()
## Fetching data from web and formatting...
## Connecting to Sqlite...
## Clearing table and inserting records...
## Disconnecting from Sqlite...
## Done!
## 
## 
Uploading data:   0%|          | 0/1567 [00:00<?, ?row/s]
Uploading data:  34%|###3      | 527/1567 [00:00<00:00, 5246.73row/s]
Uploading data:  64%|######4   | 1003/1567 [00:00<00:00, 5085.13row/s]
Uploading data:  93%|#########2| 1455/1567 [00:00<00:00, 4673.84row/s]
Uploading data: 100%|##########| 1567/1567 [00:00<00:00, 4724.12row/s]

Prepare Environment

# Attach packages: DB access (DBI), wrangling (dplyr), model tidying (broom),
# ROC metrics (ROCR), the ExtraTrees learner, date handling (lubridate),
# and plotting (ggplot2/plotly).
library(DBI)
library(dplyr)
library(broom)
library(ROCR)
library(extraTrees)
library(lubridate)
library(ggplot2)
library(plotly)

rm(list = ls()) # clear all data
try(invisible(dev.off()),silent=TRUE) # clear all plots

# Preview a wide dataframe: head() of the first n and last n columns.
# Fixes an off-by-one in the original ((ncol(df)-n):ncol(df) returned n+1
# trailing columns) and avoids duplicated columns when the frame is narrow.
printWideDataFrame <- function(df, n){
  if (ncol(df) <= 2 * n) {
    return(head(df))  # already narrow enough: show everything once
  }
  head(df[c(1:n, (ncol(df) - n + 1):ncol(df))])
}

Fetch from Database

Since the data has already been normalized into Sqlite, a SELECT statement can be used to pull the table into RAM.

In a production environment:

# connect to db, fetch table into RAM, disconnect from db.
# SQLite stores the timestamps as text, so coerce them back to POSIXct.
# PASS_FAIL is recoded so that 1 = pass and 0 = fail
# (NOTE(review): assumes the raw label uses 1 = fail -- confirm against loader).
con <- dbConnect(RSQLite::SQLite(), "warehouse.db")
df_orig <- dbGetQuery(con, "select * from sample") %>%
  mutate(
    INSERTED_ON = as.POSIXct(INSERTED_ON),
    MFG_DATE = as.POSIXct(MFG_DATE),
    PASS_FAIL = ifelse(PASS_FAIL==1,0,1))
dbDisconnect(con)

# preview dataframe
printWideDataFrame(df_orig, 15)
##   ID         INSERTED_ON            MFG_DATE PASS_FAIL MAT_VENDOR
## 1  0 2019-01-16 20:40:12 2008-07-19 11:55:00         1        ddd
## 2  1 2019-01-16 20:40:12 2008-07-19 12:32:00         1        eee
## 3  2 2019-01-16 20:40:12 2008-07-19 13:17:00         0        fff
## 4  3 2019-01-16 20:40:12 2008-07-19 14:43:00         1        ccc
## 5  4 2019-01-16 20:40:12 2008-07-19 15:22:00         1        ccc
## 6  5 2019-01-16 20:40:12 2008-07-19 17:53:00         1        eee
##   PART_VENDOR SIL_VENDOR ADHS_VENDOR SOP_VENDOR      F0      F1       F2
## 1         aaa        ddd         bbb        eee 3030.93 2564.00 2187.733
## 2         ccc        ddd         aaa        aaa 3095.78 2465.14 2230.422
## 3         aaa        eee         aaa        jjj 2932.61 2559.94 2186.411
## 4         ccc        hhh         aaa        eee 2988.72 2479.90 2199.033
## 5         bbb        aaa         bbb        iii 3032.24 2502.87 2233.367
## 6         aaa        hhh         bbb        eee 2946.25 2432.84 2233.367
##          F3     F4  F5   F574   F575   F576    F577   F578   F579   F580
## 1 1411.1265 1.3602 100 3.0624 0.1026 1.6765 14.9509     NA     NA     NA
## 2 1463.6606 0.8294 100 2.0111 0.0772 1.1065 10.9003 0.0096 0.0201 0.0060
## 3 1698.0172 1.5102 100 4.0923 0.0640 2.0952  9.2721 0.0584 0.0484 0.0148
## 4  909.7926 1.3204 100 2.8971 0.0525 1.7585  8.5831 0.0202 0.0149 0.0044
## 5 1326.5200 1.5334 100 3.1776 0.0706 1.6597 10.9698     NA     NA     NA
## 6 1326.5200 1.5334 100 2.2598 0.0899 1.6679 13.7755 0.0342 0.0151 0.0052
##       F581   F582   F583   F584    F585   F586   F587   F588     F589
## 1       NA 0.5005 0.0118 0.0035  2.3630     NA     NA     NA       NA
## 2 208.2045 0.5019 0.0223 0.0055  4.4447 0.0096 0.0201 0.0060 208.2045
## 3  82.8602 0.4958 0.0157 0.0039  3.1745 0.0584 0.0484 0.0148  82.8602
## 4  73.8432 0.4990 0.0103 0.0025  2.0544 0.0202 0.0149 0.0044  73.8432
## 5       NA 0.4800 0.4766 0.1045 99.3032 0.0202 0.0149 0.0044  73.8432
## 6  44.0077 0.4949 0.0189 0.0044  3.8276 0.0342 0.0151 0.0052  44.0077

Clean Data

Using dplyr commands to:

# massage for statistics: one-hot encode the vendor strings, then drop the
# identifier/timestamp columns and the original string columns so only the
# numeric features, dummies, and the PASS_FAIL response remain
df_stats <- df_orig %>% 
  fastDummies::dummy_cols() %>%  # add dummy variables for all string columns
  select(-c(ID, INSERTED_ON, MFG_DATE, MAT_VENDOR, PART_VENDOR, SIL_VENDOR, ADHS_VENDOR, SOP_VENDOR))  # drop columns

# preview dataframe
printWideDataFrame(df_stats, 15)
##   PASS_FAIL      F0      F1       F2        F3     F4  F5       F6     F7
## 1         1 3030.93 2564.00 2187.733 1411.1265 1.3602 100  97.6133 0.1242
## 2         1 3095.78 2465.14 2230.422 1463.6606 0.8294 100 102.3433 0.1247
## 3         0 2932.61 2559.94 2186.411 1698.0172 1.5102 100  95.4878 0.1241
## 4         1 2988.72 2479.90 2199.033  909.7926 1.3204 100 104.2367 0.1217
## 5         1 3032.24 2502.87 2233.367 1326.5200 1.5334 100 100.3967 0.1235
## 6         1 2946.25 2432.84 2233.367 1326.5200 1.5334 100 100.3967 0.1235
##       F8      F9     F10    F11      F12 F13 SIL_VENDOR_ccc SIL_VENDOR_iii
## 1 1.5005  0.0162 -0.0034 0.9455 202.4396   0              0              0
## 2 1.4966 -0.0005 -0.0148 0.9627 200.5470   0              0              0
## 3 1.4436  0.0041  0.0013 0.9615 202.0179   0              0              0
## 4 1.4882 -0.0124 -0.0033 0.9629 201.8482   0              0              0
## 5 1.5031 -0.0031 -0.0072 0.9569 201.9424   0              0              0
## 6 1.5287  0.0167  0.0055 0.9699 200.4720   0              0              0
##   SIL_VENDOR_fff ADHS_VENDOR_bbb ADHS_VENDOR_aaa SOP_VENDOR_eee
## 1              0               1               0              1
## 2              0               0               1              0
## 3              0               0               1              0
## 4              0               0               1              1
## 5              0               1               0              0
## 6              0               1               0              1
##   SOP_VENDOR_aaa SOP_VENDOR_jjj SOP_VENDOR_iii SOP_VENDOR_ddd
## 1              0              0              0              0
## 2              1              0              0              0
## 3              0              1              0              0
## 4              0              0              0              0
## 5              0              0              1              0
## 6              0              0              0              0
##   SOP_VENDOR_ccc SOP_VENDOR_kkk SOP_VENDOR_hhh SOP_VENDOR_bbb
## 1              0              0              0              0
## 2              0              0              0              0
## 3              0              0              0              0
## 4              0              0              0              0
## 5              0              0              0              0
## 6              0              0              0              0
##   SOP_VENDOR_ggg SOP_VENDOR_fff
## 1              0              0
## 2              0              0
## 3              0              0
## 4              0              0
## 5              0              0
## 6              0              0

Exploratory Data Analysis

Using JavaScript wrappers (via plotly), an HTML page can be used to show interactive volumes and yields over the given time period.

In a production environment, the UI could be a(n):

# mfg volume vs time: daily stacked bars of pass vs fail counts
df_orig %>%
  group_by(MFG_DATE = floor_date(MFG_DATE, "day")) %>%  # bucket by calendar day
  summarize(
    QTY_PASS = sum(PASS_FAIL==1),
    QTY_FAIL = sum(PASS_FAIL==0)
    ) %>%
    plot_ly(x = ~MFG_DATE, y = ~QTY_PASS, name = "Pass", type = "bar") %>%
    add_trace(y = ~QTY_FAIL, name = 'Fail') %>%
    layout(
      xaxis=list(title='Manufacturing Date'),
      yaxis=list(title='Volume'),
      barmode="stack",
      title='Semi-Conductor Production Volume vs. Time')
# mfg volume vs time (cumulative): running totals plus running yield on a
# second y-axis
df_orig %>%
  group_by(MFG_DATE = floor_date(MFG_DATE, "day")) %>%
  summarize(
    QTY = n(),
    QTY_PASS = sum(PASS_FAIL==1),
    QTY_FAIL = sum(PASS_FAIL==0)
  ) %>%
  mutate(
    # cumulative sums over days (rows are in day order after group_by)
    QTY = cumsum(QTY),
    QTY_PASS = cumsum(QTY_PASS),
    QTY_FAIL = cumsum(QTY_FAIL),
    PERC_PASS = QTY_PASS/QTY  # running yield fraction
  ) %>%
  plot_ly() %>%
  add_trace(x = ~MFG_DATE, y = ~QTY_PASS, name = "Pass", type = "bar", yaxis = 'y1') %>%
  add_trace(x = ~MFG_DATE, y = ~QTY_FAIL, name = 'Fail', type = "bar", yaxis = 'y1') %>%
  add_trace(x = ~MFG_DATE, y = ~PERC_PASS, type = 'scatter', mode = 'lines', name = 'Yield %', yaxis = 'y2') %>%
  layout(
    xaxis=list(title='Manufacturing Date'),
    yaxis=list(side='left',title='Volume (Cumulative)',showgrid=FALSE,zeroline=FALSE),
    # overlay yield on a right-hand percentage axis
    yaxis2=list(side='right',overlaying="y",title='Yield %',showgrid=FALSE,zeroline=FALSE,tickformat="%"),
    barmode="stack",
    title='Semi-Conductor Production Volume vs. Time (Cumulative)'
    )

Model 1: Logistic Regression

“Logistic Regression” represents a generalized linear model commonly used to fit a single binary response.

For more information about this algorithm, see section “Logistic Regression” here.

Advantages for problem statement:

Disadvantages for problem statement:

# working copy for the logistic-regression model
df1 <- df_stats

# median-impute missing values, feature columns only
# (a feature column is "F" followed by a number, e.g. "F42")
for (colName in names(df1)) {
  isFeature <- (substring(colName, 1, 1) == "F") &&
    !is.na(as.numeric(substring(colName, 2)))
  if (isFeature) {
    colMedian <- median(df1[, colName], na.rm = TRUE)
    df1[is.na(df1[, colName]), colName] <- colMedian
  }
}

# preview dataframe
printWideDataFrame(df1, 20)
##   PASS_FAIL      F0      F1       F2        F3     F4  F5       F6     F7
## 1         1 3030.93 2564.00 2187.733 1411.1265 1.3602 100  97.6133 0.1242
## 2         1 3095.78 2465.14 2230.422 1463.6606 0.8294 100 102.3433 0.1247
## 3         0 2932.61 2559.94 2186.411 1698.0172 1.5102 100  95.4878 0.1241
## 4         1 2988.72 2479.90 2199.033  909.7926 1.3204 100 104.2367 0.1217
## 5         1 3032.24 2502.87 2233.367 1326.5200 1.5334 100 100.3967 0.1235
## 6         1 2946.25 2432.84 2233.367 1326.5200 1.5334 100 100.3967 0.1235
##       F8      F9     F10    F11      F12 F13     F14      F15     F16
## 1 1.5005  0.0162 -0.0034 0.9455 202.4396   0  7.9558 414.8710 10.0433
## 2 1.4966 -0.0005 -0.0148 0.9627 200.5470   0 10.1548 414.7347  9.2599
## 3 1.4436  0.0041  0.0013 0.9615 202.0179   0  9.5157 416.7075  9.3144
## 4 1.4882 -0.0124 -0.0033 0.9629 201.8482   0  9.6052 422.2894  9.6924
## 5 1.5031 -0.0031 -0.0072 0.9569 201.9424   0 10.5661 420.5925 10.3387
## 6 1.5287  0.0167  0.0055 0.9699 200.4720   0  8.6617 414.2426  9.2441
##      F17      F18 SIL_VENDOR_eee SIL_VENDOR_hhh SIL_VENDOR_aaa
## 1 0.9680 192.3963              0              0              0
## 2 0.9701 191.2872              0              0              0
## 3 0.9674 192.7035              1              0              0
## 4 0.9687 192.1557              0              1              0
## 5 0.9735 191.6037              0              0              1
## 6 0.9747 191.2280              0              1              0
##   SIL_VENDOR_ggg SIL_VENDOR_bbb SIL_VENDOR_ccc SIL_VENDOR_iii
## 1              0              0              0              0
## 2              0              0              0              0
## 3              0              0              0              0
## 4              0              0              0              0
## 5              0              0              0              0
## 6              0              0              0              0
##   SIL_VENDOR_fff ADHS_VENDOR_bbb ADHS_VENDOR_aaa SOP_VENDOR_eee
## 1              0               1               0              1
## 2              0               0               1              0
## 3              0               0               1              0
## 4              0               0               1              1
## 5              0               1               0              0
## 6              0               1               0              1
##   SOP_VENDOR_aaa SOP_VENDOR_jjj SOP_VENDOR_iii SOP_VENDOR_ddd
## 1              0              0              0              0
## 2              1              0              0              0
## 3              0              1              0              0
## 4              0              0              0              0
## 5              0              0              1              0
## 6              0              0              0              0
##   SOP_VENDOR_ccc SOP_VENDOR_kkk SOP_VENDOR_hhh SOP_VENDOR_bbb
## 1              0              0              0              0
## 2              0              0              0              0
## 3              0              0              0              0
## 4              0              0              0              0
## 5              0              0              0              0
## 6              0              0              0              0
##   SOP_VENDOR_ggg SOP_VENDOR_fff
## 1              0              0
## 2              0              0
## 3              0              0
## 4              0              0
## 5              0              0
## 6              0              0
# # initialize models
# m1_full = glm(PASS_FAIL ~ ., data=df1, family=binomial(), control = list(maxit = 50))
# m1_null = glm(PASS_FAIL ~ 1, data=df1, family=binomial(), control = list(maxit = 50))
# 
# # down-select variables
# # m1_bwd = step(m1_full, direction="backward"), backward not a good choice for high dimensionality problem
# m1_fwd = step(m1_null, scope=list(lower=m1_null, upper=m1_full), direction="forward")
# m1_both = step(m1_null, scope = list(upper=m1_full), direction="both")
# 
# # compare methods
# if(m1_fwd$aic<m1_both$aic){
#   print("Step forward selection chosen")
#   m1_varModel = m1_fwd
# }else{
#   print("Step both selection chosen")
#   m1_varModel = m1_both
# }
# m1_formula <- m1_varModel$formula
# m1_formula

# Hard-coded result of the (slow) stepwise search above -- the "both"
# direction was selected.  TODO: re-run step() when the data changes.
m1_formula <- as.formula(
"PASS_FAIL ~ SIL_VENDOR_eee + F103 + F59 + F21 + F73 + F428 +
F569 + F64 + F75 + F129 + F433 + F365 + F9 + F443 + F473 +
F500 + F368 + F488 + SOP_VENDOR_ggg + F411 + F476 + F38 +
F87 + F104 + F484 + F349 + F84 + F72 + F56 + F554 + F131 +
F511 + F545 + F470 + F410 + F419 + F418 + F32 + SIL_VENDOR_ccc +
SOP_VENDOR_aaa + F320 + F66 + F321 + F94 + F132 + F575"
)


# Fit the down-selected logistic regression on the full imputed data.
# NOTE(review): the warning below indicates (quasi-)complete separation,
# so the huge coefficient estimates are not interpretable.
m1_base = glm(m1_formula, data=df1, family=binomial(), control = list(maxit = 50))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m1_base)
## 
## Call:
## glm(formula = m1_formula, family = binomial(), data = df1, control = list(maxit = 50))
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
##  -8.49    0.00    0.00    0.00    8.49  
## 
## Coefficients:
##                  Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)     1.118e+17  9.574e+08  116800273   <2e-16 ***
## SIL_VENDOR_eee -1.860e+15  4.823e+06 -385570582   <2e-16 ***
## F103           -6.645e+15  6.460e+08  -10285437   <2e-16 ***
## F59            -5.886e+13  2.506e+05 -234897325   <2e-16 ***
## F21            -3.362e+11  2.914e+03 -115374448   <2e-16 ***
## F73             2.029e+13  3.648e+05   55628443   <2e-16 ***
## F428            1.586e+13  1.654e+05   95899494   <2e-16 ***
## F569           -2.292e+13  1.853e+05 -123660497   <2e-16 ***
## F64            -7.830e+13  4.308e+05 -181743602   <2e-16 ***
## F75            -1.049e+16  8.162e+07 -128475389   <2e-16 ***
## F129           -1.745e+14  1.553e+06 -112342200   <2e-16 ***
## F433           -5.630e+11  7.974e+03  -70601180   <2e-16 ***
## F365           -8.186e+16  8.225e+08  -99530315   <2e-16 ***
## F9              1.254e+16  1.150e+08  109020153   <2e-16 ***
## F443           -1.769e+15  1.240e+07 -142638391   <2e-16 ***
## F473           -6.100e+12  8.710e+04  -70034373   <2e-16 ***
## F500           -4.088e+11  5.341e+03  -76547229   <2e-16 ***
## F368            1.267e+17  1.051e+09  120583482   <2e-16 ***
## F488            6.391e+11  6.916e+03   92421264   <2e-16 ***
## SOP_VENDOR_ggg  1.388e+14  6.077e+06   22839541   <2e-16 ***
## F411           -1.353e+14  3.194e+06  -42363755   <2e-16 ***
## F476            2.204e+12  1.388e+05   15876370   <2e-16 ***
## F38             1.098e+14  4.245e+06   25853822   <2e-16 ***
## F87            -7.567e+15  1.347e+08  -56161019   <2e-16 ***
## F104            1.794e+17  2.053e+09   87379983   <2e-16 ***
## F484            3.712e+11  8.278e+03   44844919   <2e-16 ***
## F349           -9.179e+15  1.631e+08  -56282408   <2e-16 ***
## F84             2.020e+16  3.411e+08   59217406   <2e-16 ***
## F72            -6.039e+12  3.585e+05  -16846010   <2e-16 ***
## F56            -4.287e+16  2.950e+08 -145330525   <2e-16 ***
## F554           -2.064e+14  3.163e+06  -65255420   <2e-16 ***
## F131           -6.563e+16  7.862e+08  -83486519   <2e-16 ***
## F511           -2.169e+11  5.224e+03  -41515405   <2e-16 ***
## F545            5.685e+13  1.397e+06   40683253   <2e-16 ***
## F470            5.863e+13  4.735e+05  123817519   <2e-16 ***
## F410            2.752e+13  8.189e+05   33607936   <2e-16 ***
## F419           -1.676e+11  5.271e+03  -31803275   <2e-16 ***
## F418            1.247e+11  5.980e+03   20843840   <2e-16 ***
## F32            -8.543e+13  8.762e+05  -97495392   <2e-16 ***
## SIL_VENDOR_ccc -8.954e+14  5.923e+06 -151161347   <2e-16 ***
## SOP_VENDOR_aaa -1.246e+14  5.819e+06  -21415935   <2e-16 ***
## F320           -3.402e+15  7.863e+07  -43265617   <2e-16 ***
## F66            -1.821e+13  2.287e+05  -79618705   <2e-16 ***
## F321            7.536e+13  1.190e+06   63318441   <2e-16 ***
## F94             7.806e+17  1.007e+10   77480257   <2e-16 ***
## F132            1.846e+15  3.585e+07   51494879   <2e-16 ***
## F575            9.544e+14  2.577e+07   37037828   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  765.15  on 1566  degrees of freedom
## Residual deviance: 5406.55  on 1520  degrees of freedom
## AIC: 5500.5
## 
## Number of Fisher Scoring iterations: 46

Model 2: Extremely Random Trees

“ExtraTrees” represents a Random Forest which:

For more information about this algorithm, see section “ExtraTrees” here.

Advantages for problem statement:

Disadvantages for problem statement:

# working copy for the ExtraTrees model (no imputation: the learner
# handles NAs itself via na.action below)
df2 <- df_stats

# preview dataframe
printWideDataFrame(df2, 20)
##   PASS_FAIL      F0      F1       F2        F3     F4  F5       F6     F7
## 1         1 3030.93 2564.00 2187.733 1411.1265 1.3602 100  97.6133 0.1242
## 2         1 3095.78 2465.14 2230.422 1463.6606 0.8294 100 102.3433 0.1247
## 3         0 2932.61 2559.94 2186.411 1698.0172 1.5102 100  95.4878 0.1241
## 4         1 2988.72 2479.90 2199.033  909.7926 1.3204 100 104.2367 0.1217
## 5         1 3032.24 2502.87 2233.367 1326.5200 1.5334 100 100.3967 0.1235
## 6         1 2946.25 2432.84 2233.367 1326.5200 1.5334 100 100.3967 0.1235
##       F8      F9     F10    F11      F12 F13     F14      F15     F16
## 1 1.5005  0.0162 -0.0034 0.9455 202.4396   0  7.9558 414.8710 10.0433
## 2 1.4966 -0.0005 -0.0148 0.9627 200.5470   0 10.1548 414.7347  9.2599
## 3 1.4436  0.0041  0.0013 0.9615 202.0179   0  9.5157 416.7075  9.3144
## 4 1.4882 -0.0124 -0.0033 0.9629 201.8482   0  9.6052 422.2894  9.6924
## 5 1.5031 -0.0031 -0.0072 0.9569 201.9424   0 10.5661 420.5925 10.3387
## 6 1.5287  0.0167  0.0055 0.9699 200.4720   0  8.6617 414.2426  9.2441
##      F17      F18 SIL_VENDOR_eee SIL_VENDOR_hhh SIL_VENDOR_aaa
## 1 0.9680 192.3963              0              0              0
## 2 0.9701 191.2872              0              0              0
## 3 0.9674 192.7035              1              0              0
## 4 0.9687 192.1557              0              1              0
## 5 0.9735 191.6037              0              0              1
## 6 0.9747 191.2280              0              1              0
##   SIL_VENDOR_ggg SIL_VENDOR_bbb SIL_VENDOR_ccc SIL_VENDOR_iii
## 1              0              0              0              0
## 2              0              0              0              0
## 3              0              0              0              0
## 4              0              0              0              0
## 5              0              0              0              0
## 6              0              0              0              0
##   SIL_VENDOR_fff ADHS_VENDOR_bbb ADHS_VENDOR_aaa SOP_VENDOR_eee
## 1              0               1               0              1
## 2              0               0               1              0
## 3              0               0               1              0
## 4              0               0               1              1
## 5              0               1               0              0
## 6              0               1               0              1
##   SOP_VENDOR_aaa SOP_VENDOR_jjj SOP_VENDOR_iii SOP_VENDOR_ddd
## 1              0              0              0              0
## 2              1              0              0              0
## 3              0              1              0              0
## 4              0              0              0              0
## 5              0              0              1              0
## 6              0              0              0              0
##   SOP_VENDOR_ccc SOP_VENDOR_kkk SOP_VENDOR_hhh SOP_VENDOR_bbb
## 1              0              0              0              0
## 2              0              0              0              0
## 3              0              0              0              0
## 4              0              0              0              0
## 5              0              0              0              0
## 6              0              0              0              0
##   SOP_VENDOR_ggg SOP_VENDOR_fff
## 1              0              0
## 2              0              0
## 3              0              0
## 4              0              0
## 5              0              0
## 6              0              0
# run model and summarize
# NOTE(review): PASS_FAIL is numeric here, so extraTrees fits a REGRESSION
# (see "type: numeric (regression)" below); pass a factor if a classifier
# is intended.  na.action="fuse" lets the trees handle NA values directly
# -- confirm mechanism against the extraTrees package docs.
m2_base = extraTrees(df2 %>% select(-PASS_FAIL),df2$PASS_FAIL,numRandomCuts=1,na.action="fuse")
m2_base
## ExtraTrees:
##  - # of trees: 500
##  - node size:  5
##  - # of dim:   622
##  - # of tries: 207
##  - type:       numeric (regression)
##  - multi-task: no
# extraTrees does not have a variable importance function

Cross Validation

Perform 5 fold cross validation for both models and generate ROC graphs here.

# working copy for cross validation
df <- df_stats

n = 5  # number of folds
# NOTE(review): no set.seed() before sample(), so fold assignment (and the
# numbers below) are not reproducible run-to-run.
df = df[sample(nrow(df)),]  # Randomly shuffle the data
folds = cut(seq(1,nrow(df)),breaks=n,labels=FALSE)  # Create 5 equally size folds

# per-fold accuracy and precision for each model (filled in the loop below)
accuracy = as.data.frame(matrix(data=NA,nrow=n,ncol=2))
precision = as.data.frame(matrix(data=NA,nrow=n,ncol=2))
names(accuracy) = c("m1","m2")
names(precision) = c("m1","m2")
cutoff = 0.50  # score threshold at which accuracy/precision are read off

pdf(file='./docs/ROCs.pdf',width=10,height=7.5)  # begin pdf writer

# Perform 5 fold cross validation: for each fold, train both models on the
# other 4 folds, score the held-out fold, and plot ROC/precision/accuracy.
for(i in 1:n){
  # Segment the data by fold using the which() function 
  testIndexes = which(folds==i,arr.ind=TRUE)
  testData = df[testIndexes, ]
  trainData = df[-testIndexes, ]
  
  # model 1: logistic regression  
  m1 = glm(m1_formula,data=trainData,family='binomial',control=list(maxit=50))
  p1 = predict(m1,newdata=testData,type='response')
  # ROCR performance objects: ROC curve, precision/accuracy vs cutoff, AUC
  pr1 = prediction(p1,testData$PASS_FAIL)
  prf1 = performance(pr1,measure="tpr",x.measure="fpr")
  prec1 = performance(pr1,measure="prec")
  acc1 = performance(pr1,measure="acc")
  auc1 = performance(pr1,measure="auc")
  
  # model 2: extremely random forest
  m2 = extraTrees(trainData %>% select(-PASS_FAIL),trainData$PASS_FAIL,numRandomCuts=1,na.action="fuse")
  p2 = predict(m2,testData %>% select(-PASS_FAIL))
  pr2 = prediction(p2,testData$PASS_FAIL)
  prf2 = performance(pr2,measure="tpr",x.measure="fpr")
  prec2 = performance(pr2,measure="prec")
  acc2 = performance(pr2,measure="acc")
  auc2 = performance(pr2,measure="auc")
  
  # graph results: square ROC panel with both models and the chance diagonal
  par(pty="s")
  plot(prf1,main=paste('ROC: Fold ',i,sep=''),xaxs='i',yaxs='i',asp=1)
  lines(prf2@x.values[[1]],prf2@y.values[[1]],col='red')
  abline(a=0,b=1,lty=2)
  legend('bottomright',
         c(paste('Model 1 | AUC=',format(round(auc1@y.values[[1]],3),3),sep=''),
           paste('Model 2 | AUC=',format(round(auc2@y.values[[1]],3),3),sep='')),
         col=c('black','red'),lty=c(1,1))

  # precision vs cutoff, dashed line marks the 0.5 operating point
  par(pty="m")
  plot(prec1,main=paste('Precision: Fold ',i,sep=''),ylim=c(0.4,1))
  lines(prec2@x.values[[1]],prec2@y.values[[1]],col='red')
  abline(v=0.5,lty=2)
  legend('topleft',c('Model 1','Model 2'),col=c('black','red'),lty=c(1,1))

  # accuracy vs cutoff
  plot(acc1,main=paste('Accuracy: Fold ',i,sep=''),ylim=c(0.4,1))
  lines(acc2@x.values[[1]],acc2@y.values[[1]],col='red')
  abline(v=0.5,lty=2)
  legend('topleft',c('Model 1','Model 2'),col=c('black','red'),lty=c(1,1))

  # read accuracy/precision off each curve at the cutoff: ROCR's x.values
  # (cutoffs) are sorted descending, so max(which(x >= cutoff)) is the
  # smallest cutoff still >= 0.50
  accuracy$m1[i] = acc1@y.values[[1]][max(which(acc1@x.values[[1]]>=cutoff))]
  accuracy$m2[i] = acc2@y.values[[1]][max(which(acc2@x.values[[1]]>=cutoff))]

  precision$m1[i] = prec1@y.values[[1]][max(which(prec1@x.values[[1]]>=cutoff))]
  precision$m2[i] = prec2@y.values[[1]][max(which(prec2@x.values[[1]]>=cutoff))]
  
}
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
invisible(dev.off())  # close pdf writer

Comparing Models

A simple paired t-test can be used to compare the models on their per-fold accuracy and precision results.

# Paired t-tests across the 5 folds; null hypothesis: m1-m2=0
accuracy_test = t.test(accuracy$m1,accuracy$m2,conf.level=0.95,paired=T)
precision_test = t.test(precision$m1,precision$m2,conf.level=0.95,paired=T)

# per-fold accuracies and the paired-test result
accuracy
##          m1        m2
## 1 0.9090909 0.9483395
## 2 0.9583333 0.9628253
## 3 0.9252336 0.9200000
## 4 0.9166667 0.9625468
## 5 0.9294118 0.9267516
accuracy_test
## 
##  Paired t-test
## 
## data:  accuracy$m1 and accuracy$m2
## t = -1.5034, df = 4, p-value = 0.2072
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.04653229  0.01384155
## sample estimates:
## mean of the differences 
##             -0.01634537
# Report the accuracy comparison at the 5% significance level.
if (accuracy_test$p.value > 0.05) {
  accuracyVerdict <- "Model 1 and Model 2 accuracies are not significantly different."
} else if (mean(accuracy$m1) > mean(accuracy$m2)) {
  accuracyVerdict <- "Model 1 is statistically more accurate than Model 2."
} else {
  accuracyVerdict <- "Model 2 is statistically more accurate than Model 1."
}
print(accuracyVerdict)
## [1] "Model 1 and Model 2 accuracies are not significantly different."
# per-fold precisions and the paired-test result
precision
##          m1        m2
## 1 0.9772727 0.9618321
## 2 1.0000000 0.9651163
## 3 0.9787234 0.9453125
## 4 0.9340659 0.9680000
## 5 0.9733333 0.9621993
precision_test
## 
##  Paired t-test
## 
## data:  precision$m1 and precision$m2
## t = 0.97819, df = 4, p-value = 0.3834
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.02240418  0.04677827
## sample estimates:
## mean of the differences 
##              0.01218705
# Report the precision comparison at the 5% significance level.
if (precision_test$p.value > 0.05) {
  precisionVerdict <- "Model 1 and Model 2 precisions are not significantly different."
} else if (mean(precision$m1) > mean(precision$m2)) {
  precisionVerdict <- "Model 1 is statistically more precise than Model 2."
} else {
  precisionVerdict <- "Model 2 is statistically more precise than Model 1."
}
print(precisionVerdict)
## [1] "Model 1 and Model 2 precisions are not significantly different."

Important Variables

The important variables to be exported are from Model 1. With neither model being significantly different from one another in terms of accuracy or precision, Model 1 variables were chosen due to the lack of documentation for the “ExtraTrees” package on how to extract these variables in Model 2.

The time series graphs of the important variables can be found here.

options(warn=-1)  # silence plotting warnings for the batch render
# NOTE(review): options(warn=0) at the end resets to the default rather than
# restoring the prior value; consider old <- options(warn=-1); options(old).
pdf(file='./docs/ImportantVariables.pdf',width=10,height=7.5)  # begin pdf writer
# one time-series scatter per Model 1 coefficient ([-1] drops the intercept)
for(name in sort(names(m1_base$coefficients)[-1])){
  p <- df_orig %>%
    fastDummies::dummy_cols() %>%  # add dummy variables for all string columns
    mutate(PASS_FAIL = as.factor(ifelse(PASS_FAIL==1,"PASS","FAIL"))) %>%
    # aes_string accepts the variable name held in `name`; note it is
    # soft-deprecated in newer ggplot2 (aes(.data[[name]]) is the modern form)
    ggplot(aes_string(x = "MFG_DATE", y = name)) +
    geom_point(alpha = 0.4, aes(colour = PASS_FAIL)) +
    scale_color_manual(values = c("FAIL" = "red", "PASS" = "black")) +
    guides(colour = guide_legend(override.aes = list(alpha = 1, size = 2))) +
    ggtitle(paste("Variable:",name)) +
    xlab("Manufacturing Date") +
    ylab("Value")
  print(p)  # explicit print required inside a loop for ggplot objects
}
invisible(dev.off())
options(warn=0)

Future Work